# Librería para manejar las contracciones que se presentan en el inglés.
!pip install contractions
Collecting contractions Downloading contractions-0.1.68-py2.py3-none-any.whl (8.1 kB) Collecting textsearch>=0.0.21 Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB) Collecting pyahocorasick Downloading pyahocorasick-1.4.4-cp39-cp39-win_amd64.whl (39 kB) Collecting anyascii Downloading anyascii-0.3.0-py3-none-any.whl (284 kB) Installing collected packages: pyahocorasick, anyascii, textsearch, contractions Successfully installed anyascii-0.3.0 contractions-0.1.68 pyahocorasick-1.4.4 textsearch-0.0.21
!pip install plotly
Collecting plotly Downloading plotly-5.6.0-py2.py3-none-any.whl (27.7 MB) Requirement already satisfied: six in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from plotly) (1.16.0) Collecting tenacity>=6.2.0 Downloading tenacity-8.0.1-py3-none-any.whl (24 kB) Installing collected packages: tenacity, plotly Successfully installed plotly-5.6.0 tenacity-8.0.1
# librería para manejar las flexiones gramaticales en el idioma inglés.
!pip install inflect
!pip install pandas-profiling==2.7.1
Collecting inflect Downloading inflect-5.4.0-py3-none-any.whl (33 kB) Installing collected packages: inflect Successfully installed inflect-5.4.0 Collecting pandas-profiling==2.7.1 Downloading pandas_profiling-2.7.1-py2.py3-none-any.whl (252 kB) Requirement already satisfied: tqdm>=4.43.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (4.62.3) Collecting visions[type_image_path]==0.4.1 Downloading visions-0.4.1-py3-none-any.whl (58 kB) Collecting phik>=0.9.10 Downloading phik-0.12.2-cp39-cp39-win_amd64.whl (685 kB) Collecting htmlmin>=0.1.12 Downloading htmlmin-0.1.12.tar.gz (19 kB) Collecting tangled-up-in-unicode>=0.0.4 Downloading tangled_up_in_unicode-0.2.0-py3-none-any.whl (4.7 MB) Requirement already satisfied: astropy>=4.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (4.3.1) Requirement already satisfied: joblib in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (1.1.0) Requirement already satisfied: requests>=2.23.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (2.26.0) Requirement already satisfied: numpy>=1.16.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (1.22.2) Requirement already satisfied: scipy>=1.4.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (1.7.1) Requirement already satisfied: pandas!=1.0.0,!=1.0.1,!=1.0.2,>=0.25.3 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (1.3.4) Requirement already satisfied: jinja2>=2.11.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (2.11.3) Requirement already satisfied: ipywidgets>=7.5.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (7.6.5) Collecting missingno>=0.4.2 Downloading 
missingno-0.5.1-py3-none-any.whl (8.7 kB) Requirement already satisfied: matplotlib>=3.2.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas-profiling==2.7.1) (3.4.3) Collecting confuse>=1.0.0 Downloading confuse-1.7.0-py2.py3-none-any.whl (25 kB) Requirement already satisfied: networkx>=2.4 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from visions[type_image_path]==0.4.1->pandas-profiling==2.7.1) (2.6.3) Requirement already satisfied: attrs>=19.3.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from visions[type_image_path]==0.4.1->pandas-profiling==2.7.1) (21.2.0) Collecting imagehash Downloading ImageHash-4.2.1.tar.gz (812 kB) Requirement already satisfied: Pillow in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from visions[type_image_path]==0.4.1->pandas-profiling==2.7.1) (8.4.0) Requirement already satisfied: pyerfa>=1.7.3 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from astropy>=4.0->pandas-profiling==2.7.1) (2.0.0) Requirement already satisfied: pyyaml in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from confuse>=1.0.0->pandas-profiling==2.7.1) (6.0) Requirement already satisfied: ipykernel>=4.5.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.7.1) (6.4.1) Requirement already satisfied: traitlets>=4.3.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.7.1) (5.1.0) Requirement already satisfied: ipython>=4.0.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.7.1) (7.29.0) Requirement already satisfied: widgetsnbextension~=3.5.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.7.1) (3.5.1) Requirement already satisfied: jupyterlab-widgets>=1.0.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from 
ipywidgets>=7.5.1->pandas-profiling==2.7.1) (1.0.0) Requirement already satisfied: ipython-genutils~=0.2.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.2.0) Requirement already satisfied: nbformat>=4.2.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipywidgets>=7.5.1->pandas-profiling==2.7.1) (5.1.3) Requirement already satisfied: jupyter-client<8.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (6.1.12) Requirement already satisfied: tornado<7.0,>=4.2 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (6.1) Requirement already satisfied: debugpy<2.0,>=1.0.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (1.4.1) Requirement already satisfied: matplotlib-inline<0.2.0,>=0.1.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.1.2) Requirement already satisfied: decorator in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (5.1.0) Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (3.0.20) Requirement already satisfied: backcall in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.2.0) Requirement already satisfied: pickleshare in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.7.5) Requirement already satisfied: pygments in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from 
ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (2.10.0) Requirement already satisfied: setuptools>=18.5 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (58.0.4) Requirement already satisfied: jedi>=0.16 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.18.0) Requirement already satisfied: colorama in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.4.4) Requirement already satisfied: parso<0.9.0,>=0.8.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from jedi>=0.16->ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.8.2) Requirement already satisfied: MarkupSafe>=0.23 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from jinja2>=2.11.1->pandas-profiling==2.7.1) (1.1.1) Requirement already satisfied: pyzmq>=13 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from jupyter-client<8.0->ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (22.2.1) Requirement already satisfied: python-dateutil>=2.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from jupyter-client<8.0->ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (2.8.2) Requirement already satisfied: jupyter-core>=4.6.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from jupyter-client<8.0->ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (4.8.1) Requirement already satisfied: pywin32>=1.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from jupyter-core>=4.6.0->jupyter-client<8.0->ipykernel>=4.5.1->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (228) Requirement already satisfied: cycler>=0.10 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.7.1) (0.10.0) Requirement already satisfied: 
kiwisolver>=1.0.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.7.1) (1.3.1) Requirement already satisfied: pyparsing>=2.2.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=3.2.0->pandas-profiling==2.7.1) (3.0.4) Requirement already satisfied: six in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib>=3.2.0->pandas-profiling==2.7.1) (1.16.0) Requirement already satisfied: seaborn in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from missingno>=0.4.2->pandas-profiling==2.7.1) (0.11.2) Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (3.2.0) Requirement already satisfied: pyrsistent>=0.14.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.18.0) Requirement already satisfied: pytz>=2017.3 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from pandas!=1.0.0,!=1.0.1,!=1.0.2,>=0.25.3->pandas-profiling==2.7.1) (2021.3) Requirement already satisfied: wcwidth in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=4.0.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.2.5) Requirement already satisfied: idna<4,>=2.5 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.7.1) (3.2) Requirement already satisfied: charset-normalizer~=2.0.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.7.1) (2.0.4) Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.7.1) (1.26.7) Requirement already satisfied: certifi>=2017.4.17 in 
c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from requests>=2.23.0->pandas-profiling==2.7.1) (2021.10.8) Requirement already satisfied: notebook>=4.4.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (6.4.5) Requirement already satisfied: nbconvert in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (6.1.0) Requirement already satisfied: terminado>=0.8.3 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.9.4) Requirement already satisfied: argon2-cffi in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (20.1.0) Requirement already satisfied: Send2Trash>=1.5.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (1.8.0) Requirement already satisfied: prometheus-client in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.11.0) Requirement already satisfied: pywinpty>=0.5 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from terminado>=0.8.3->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.5.7) Requirement already satisfied: cffi>=1.0.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (1.14.6) Requirement already satisfied: pycparser in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from 
cffi>=1.0.0->argon2-cffi->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (2.20) Requirement already satisfied: PyWavelets in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from imagehash->visions[type_image_path]==0.4.1->pandas-profiling==2.7.1) (1.1.1) Requirement already satisfied: pandocfilters>=1.4.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (1.4.3) Requirement already satisfied: entrypoints>=0.2.2 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.3) Requirement already satisfied: testpath in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.5.0) Requirement already satisfied: nbclient<0.6.0,>=0.5.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.5.3) Requirement already satisfied: defusedxml in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.7.1) Requirement already satisfied: jupyterlab-pygments in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.1.2) Requirement already satisfied: bleach in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (4.0.0) Requirement already satisfied: mistune<2,>=0.8.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from 
nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.8.4) Requirement already satisfied: async-generator in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (1.10) Requirement already satisfied: nest-asyncio in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from nbclient<0.6.0,>=0.5.0->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (1.5.1) Requirement already satisfied: webencodings in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (0.5.1) Requirement already satisfied: packaging in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from bleach->nbconvert->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets>=7.5.1->pandas-profiling==2.7.1) (21.0) Building wheels for collected packages: htmlmin, imagehash Building wheel for htmlmin (setup.py): started Building wheel for htmlmin (setup.py): finished with status 'done' Created wheel for htmlmin: filename=htmlmin-0.1.12-py3-none-any.whl size=27098 sha256=29baad235aef796ec697bb8c7e3ee5d162b7fd1f49175406e72516cab90c519a Stored in directory: c:\users\user.desktop-udhiarp\appdata\local\pip\cache\wheels\1d\05\04\c6d7d3b66539d9e659ac6dfe81e2d0fd4c1a8316cc5a403300 Building wheel for imagehash (setup.py): started Building wheel for imagehash (setup.py): finished with status 'done' Created wheel for imagehash: filename=ImageHash-4.2.1-py2.py3-none-any.whl size=295207 sha256=3b100a422ab3485d55c404525142901e9fc304d72ddb70f1b14613415e332941 Stored in directory: c:\users\user.desktop-udhiarp\appdata\local\pip\cache\wheels\51\f9\a5\740af2fdb0ad1edf79aabdc41531be0b6f0b2e2be684c388cf Successfully built htmlmin imagehash Installing collected packages: 
tangled-up-in-unicode, visions, imagehash, phik, missingno, htmlmin, confuse, pandas-profiling Successfully installed confuse-1.7.0 htmlmin-0.1.12 imagehash-4.2.1 missingno-0.5.1 pandas-profiling-2.7.1 phik-0.12.2 tangled-up-in-unicode-0.2.0 visions-0.4.1
!pip install scikit-plot
Collecting scikit-plot Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB) Requirement already satisfied: matplotlib>=1.4.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from scikit-plot) (3.4.3) Requirement already satisfied: joblib>=0.10 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from scikit-plot) (1.1.0) Requirement already satisfied: scikit-learn>=0.18 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from scikit-plot) (0.24.2) Requirement already satisfied: scipy>=0.9 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from scikit-plot) (1.7.1) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (1.3.1) Requirement already satisfied: pyparsing>=2.2.1 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (3.0.4) Requirement already satisfied: numpy>=1.16 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (1.22.2) Requirement already satisfied: pillow>=6.2.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (8.4.0) Requirement already satisfied: cycler>=0.10 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (0.10.0) Requirement already satisfied: python-dateutil>=2.7 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from matplotlib>=1.4.0->scikit-plot) (2.8.2) Requirement already satisfied: six in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib>=1.4.0->scikit-plot) (1.16.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\user.desktop-udhiarp\anaconda3\lib\site-packages (from scikit-learn>=0.18->scikit-plot) (2.2.0) Installing collected packages: scikit-plot Successfully installed scikit-plot-0.3.7
# Natural Language Toolkit (NLTK): NLP utilities used throughout this notebook.
import nltk
# Punkt: pretrained tokenizer model that splits text into sentences/tokens.
nltk.download('punkt')
[nltk_data] Downloading package punkt to C:\Users\USER.DESKTOP- [nltk_data] UDHIARP\AppData\Roaming\nltk_data... [nltk_data] Unzipping tokenizers\punkt.zip.
True
# English stop-word lists (used later to filter very common words).
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to C:\Users\USER.DESKTOP- [nltk_data] UDHIARP\AppData\Roaming\nltk_data... [nltk_data] Unzipping corpora\stopwords.zip.
True
# WordNet corpus, required by WordNetLemmatizer to look up word lemmas.
# (A lemma is the dictionary base form of a word; writing this mapping by
# hand would require large morphological rule sets plus exception lists.)
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to C:\Users\USER.DESKTOP- [nltk_data] UDHIARP\AppData\Roaming\nltk_data... [nltk_data] Unzipping corpora\wordnet.zip.
True
# Instalación de librerias
import pandas as pd
import numpy as np
import sys
from pandas_profiling import ProfileReport
import re, string, unicodedata
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
import plotly.express as px
from sklearn.metrics import classification_report, confusion_matrix, plot_precision_recall_curve,accuracy_score
from sklearn.base import BaseEstimator, ClassifierMixin
import scikitplot.metrics as skplt
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import matplotlib.pyplot as plt
# Read the clinical-trials dataset with pandas (comma-separated, UTF-8).
cl_data = pd.read_csv('clinical_trials_on_cancer_data_clasificacion.csv', sep=',', encoding='utf-8')
# Work on an independent copy. The original did `cl_data_t = cl_data`,
# which only aliases the frame, so every in-place preprocessing step below
# silently mutated the raw data as well. .copy() keeps cl_data intact.
cl_data_t = cl_data.copy()
cl_data_t.head()
| label | study_and_condition | |
|---|---|---|
| 0 | __label__0 | study interventions are Saracatinib . recurren... |
| 1 | __label__1 | study interventions are Stem cell transplantat... |
| 2 | __label__0 | study interventions are Lenograstim . recurren... |
| 3 | __label__0 | study interventions are Doxorubicin . stage ii... |
| 4 | __label__1 | study interventions are Poly I-C . prostate ca... |
textos = cl_data_t.copy()
# Per-document statistics for profiling. Tokenize each text ONCE and reuse
# the token list; the original re-split the string for every statistic.
_tokens = [x.split(' ') for x in textos['study_and_condition']]
textos['Conteo'] = [len(x) for x in textos['study_and_condition']]      # character count
textos['Moda'] = [max(set(t), key=t.count) for t in _tokens]            # most frequent token
textos['Max'] = [max(len(w) for w in t) for t in _tokens]               # longest token length
textos['Min'] = [min(len(w) for w in t) for t in _tokens]               # shortest token length
# Profile the engineered features with pandas-profiling.
ProfileReport(textos)
Se realiza la primera limpieza de datos removiendo o transformando palabras, caracteres, etc., para obtener una mejor precisión a la hora de construir el modelo.
def remove_non_ascii(words):
    """Return the tokens with any non-ASCII characters stripped out.

    Each word is Unicode-normalized (NFKD) so accented characters decompose
    into base letter + combining mark, then encoded to ASCII dropping
    whatever cannot be represented.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]
def to_lowercase(words):
    """Return the tokens converted to lowercase."""
    return [token.lower() for token in words]
def remove_punctuation(words):
    """Strip punctuation from each token; drop tokens that become empty."""
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']
def replace_numbers(words):
    """Replace every all-digit token with its English spelling (e.g. '2' -> 'two')."""
    engine = inflect.engine()
    return [engine.number_to_words(token) if token.isdigit() else token
            for token in words]
def remove_stopwords(words):
    """Drop English stop words from the token list.

    The stop-word list is materialized into a set once per call; the
    original called stopwords.words('english') for EVERY token, rebuilding
    the corpus list and doing an O(n) membership scan each time.
    """
    stop_set = set(stopwords.words('english'))
    return [w for w in words if w not in stop_set]
def preprocessing(words):
    """Run the full cleaning pipeline over a list of tokens.

    Order matters: lowercase first, spell out numbers, strip punctuation,
    drop non-ASCII characters, and finally remove stop words.
    """
    pipeline = (to_lowercase, replace_numbers, remove_punctuation,
                remove_non_ascii, remove_stopwords)
    for step in pipeline:
        words = step(words)
    return words
# Expand English contractions (e.g. "don't" -> "do not") in every document.
cl_data_t['study_and_condition'] = cl_data_t['study_and_condition'].apply(contractions.fix) # fix contractions
# Tokenize each document and run the noise-removal pipeline defined above.
cl_data_t['words'] = cl_data_t['study_and_condition'].apply(word_tokenize).apply(preprocessing) # remove noise
cl_data_t.head()
| label | study_and_condition | words | |
|---|---|---|---|
| 0 | __label__0 | study interventions are Saracatinib . recurren... | [study, interventions, saracatinib, recurrent,... |
| 1 | __label__1 | study interventions are Stem cell transplantat... | [study, interventions, stem, cell, transplanta... |
| 2 | __label__0 | study interventions are Lenograstim . recurren... | [study, interventions, lenograstim, recurrent,... |
| 3 | __label__0 | study interventions are Doxorubicin . stage ii... | [study, interventions, doxorubicin, stage, iii... |
| 4 | __label__1 | study interventions are Poly I-C . prostate ca... | [study, interventions, poly, ic, prostate, can... |
Se realiza la última limpieza de datos con la lematización de las palabras.
def stem_words(words):
    """Reduce each token to its Lancaster stem (aggressive suffix stripping)."""
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]
def lemmatize_verbs(words):
    """Map each token to its WordNet lemma, treating every token as a verb."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]
def stem_and_lemmatize(words):
    """Return the stemmed tokens followed by the lemmatized tokens.

    NOTE(review): this concatenates both views of the text, doubling the
    token count per document — presumably intentional feature enrichment;
    confirm before changing.
    """
    return stem_words(words) + lemmatize_verbs(words)
# Apply stemming and lemmatization (prefix/suffix removal) to every document.
cl_data_t['words'] = cl_data_t['words'].apply(stem_and_lemmatize) # stemming + lemmatization
cl_data_t.head()
| label | study_and_condition | words | |
|---|---|---|---|
| 0 | __label__0 | study interventions are Saracatinib . recurren... | [study, interv, saracatinib, recur, verruc, ca... |
| 1 | __label__1 | study interventions are Stem cell transplantat... | [study, interv, stem, cel, transpl, hodgkin, l... |
| 2 | __label__0 | study interventions are Lenograstim . recurren... | [study, interv, lenograstim, recur, adult, dif... |
| 3 | __label__0 | study interventions are Doxorubicin . stage ii... | [study, interv, doxorubicin, stag, ii, diffus,... |
| 4 | __label__1 | study interventions are Poly I-C . prostate ca... | [study, interv, poly, ic, prost, cant, diagnos... |
Se obtiene la tabla final con la nueva columna "words", ya con la limpieza de datos realizada.
# Join each token list back into a single space-separated string per row,
# the format the vectorizers below expect.
cl_data_t['words'] = cl_data_t['words'].apply(lambda x: ' '.join(map(str, x)))
cl_data_t
| label | study_and_condition | words | |
|---|---|---|---|
| 0 | __label__0 | study interventions are Saracatinib . recurren... | study interv saracatinib recur verruc carcinom... |
| 1 | __label__1 | study interventions are Stem cell transplantat... | study interv stem cel transpl hodgkin lymphom ... |
| 2 | __label__0 | study interventions are Lenograstim . recurren... | study interv lenograstim recur adult diffus mi... |
| 3 | __label__0 | study interventions are Doxorubicin . stage ii... | study interv doxorubicin stag ii diffus larg c... |
| 4 | __label__1 | study interventions are Poly I-C . prostate ca... | study interv poly ic prost cant diagnos unreso... |
| ... | ... | ... | ... |
| 11995 | __label__0 | study interventions are Prednisolone hemisucci... | study interv prednisolon hemisuccin recur chil... |
| 11996 | __label__0 | study interventions are Bevacizumab . recurren... | study interv bevacizumab recur rect cant diagn... |
| 11997 | __label__1 | study interventions are Antibodies, Monoclonal... | study interv antibody monoclon recur lymphobla... |
| 11998 | __label__0 | study interventions are Vorinostat . colorecta... | study interv vorinost colorect cant diagnos pa... |
| 11999 | __label__0 | study interventions are Freund's Adjuvant . ov... | study interv freund adjuv ov cant diagnos four... |
12000 rows × 3 columns
# Histogram of the class labels to check class balance before modelling.
fig = px.histogram(cl_data_t, x="label")
fig.update_traces(marker_color="rgb(128,128,255)",marker_line_color='black',
marker_line_width=1.5)
fig.update_layout(title_text='Label Count')
fig.show()
Existe un balanceo exacto entre las clases, por lo que no es necesario utilizar una técnica de sobremuestreo como SMOTE.
# Persist the processed dataset so preprocessing need not be rerun.
filename = 'datosProcesados.csv'
cl_data_t.to_csv(filename)
Se crea un nuevo archivo con los datos procesados
# ---- Random-forest baseline on TF-IDF features ----
cl_rf = cl_data_t.copy()
X = cl_rf['words']
y = cl_rf['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Fit the TF-IDF vocabulary on the training split only, then transform both
# splits with it. The original called fit() and then fit_transform() on the
# same data, needlessly refitting the vectorizer a second time.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
# sklearn convention is (y_true, y_pred); the original swapped them, which
# exchanges precision and recall in the per-class report.
print('Classification report:\n\n', classification_report(y_test, preds))
Classification report:
precision recall f1-score support
__label__0 0.80 0.82 0.81 1198
__label__1 0.81 0.80 0.80 1202
accuracy 0.81 2400
macro avg 0.81 0.81 0.81 2400
weighted avg 0.81 0.81 0.81 2400
# Held-out accuracy of the random forest (accuracy_score is a symmetric
# match rate, so the argument order does not affect the value).
acc = accuracy_score(preds, y_test)
print(f"Model Accuracy = {round(acc*100,2)}%")
Model Accuracy = 80.62%
# Plot the confusion matrix for the random-forest predictions.
skplt.plot_confusion_matrix(y_test,preds)
<matplotlib.axes._subplots.AxesSubplot at 0x7f5731611650>
# ---- Logistic-regression baseline on raw token counts ----
data_rl = cl_data_t.copy()
X_train, X_test, y_train, y_test = train_test_split(data_rl['words'], data_rl['label'], test_size = 0.2)
# token_pattern keeps single-character tokens too (the default drops them).
cv = CountVectorizer(token_pattern=r'\b\w+\b')
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)
# max_iter raised from the default 100: the recorded run ended with a
# ConvergenceWarning (lbfgs hit the iteration limit before converging).
rl = LogisticRegression(max_iter=1000)
rl.fit(X_train,y_train)
/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
LogisticRegression()
# Predict on the held-out split and report accuracy (accuracy_score is
# symmetric in its arguments, so the order does not affect the value).
predictions = rl.predict(X_test)
acc = accuracy_score(predictions, y_test)
print(f"Model Accuracy = {round(acc*100,2)}%")
Model Accuracy = 80.21%
# sklearn convention is (y_true, y_pred); the original passed the
# predictions first, which transposes the matrix. The unused
# `new = np.asarray(y_test)` local was also dropped.
print('Confusion Matrix:\n\n', confusion_matrix(y_test, predictions))
Confusion Matrix: [[947 227] [248 978]]
# y_true must come first; swapping the arguments exchanges per-class
# precision and recall in the printed report.
print('Classification report:\n\n', classification_report(y_test, predictions))
Classification report:
precision recall f1-score support
__label__0 0.79 0.81 0.80 1174
__label__1 0.81 0.80 0.80 1226
accuracy 0.80 2400
macro avg 0.80 0.80 0.80 2400
weighted avg 0.80 0.80 0.80 2400
# Plot the confusion matrix for the logistic-regression predictions.
skplt.plot_confusion_matrix(y_test,predictions)
<matplotlib.axes._subplots.AxesSubplot at 0x7f2523656610>
# ---- Multinomial naive Bayes with alpha tuned on a validation split ----
data_nb = cl_data_t.copy()
def Text_Into_Vector(model, data):
    """Instantiate vectorizer class `model` and fit it on `data`.

    The vectorizer is configured for unigrams and bigrams. Returns the
    fitted vectorizer and the resulting document-term matrix.
    """
    vectorizer = model(ngram_range=(1, 2))  # unigrams + bigrams (scikit-learn)
    matrix = vectorizer.fit_transform(data.values)
    return vectorizer, matrix
def Split_data(x_vec, y_vec):
    """Split into train/test, then carve a validation (CV) set out of train.

    Both splits hold out 33% of their input and use random_state=0 for
    reproducibility. Returns the inner-train, CV and test folds plus the
    full (pre-CV) training fold.
    """
    first_split = train_test_split(x_vec, y_vec, test_size=.33, random_state=0)
    X_train, X_test, Y_train, Y_test = first_split
    second_split = train_test_split(X_train, Y_train, test_size=.33, random_state=0)
    X_tr, X_cv, Y_tr, Y_cv = second_split
    return X_tr, X_cv, X_test, Y_tr, Y_test, Y_cv, X_train, Y_train
def Normalization(train, cv, test):
    """Row-normalize the three feature matrices with sklearn's normalize.

    Bug fix: the original referenced `preprocessing.normalize`, but in this
    file `preprocessing` is the local token-cleaning FUNCTION (sklearn's
    preprocessing module was never imported), so the call raised
    AttributeError. Importing normalize locally avoids shadowing that
    function while keeping the intended behavior.
    """
    from sklearn.preprocessing import normalize
    return normalize(train), normalize(cv), normalize(test)
def Multinomial_NB(X_train, X_cv, Y_train, Y_cv):
    """Grid-search the MultinomialNB smoothing parameter alpha.

    Fits one model per candidate alpha, scores it by ROC AUC of the
    positive-class probabilities on the validation split, and returns the
    alpha with the best validation AUC. (The original also accumulated
    train/CV AUC lists that were never used or returned; the unused
    bookkeeping has been dropped — the returned value is unchanged.)
    """
    alphas = [10000, 5000, 1000, 500, 100, 50, 10, 5, 1, 0.5, 0.1, 0.05,
              0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
    best_alpha = 0
    max_roc_auc = -1
    for a in alphas:
        model = MultinomialNB(alpha=a)
        model.fit(X_train, Y_train)
        # Probability of the positive class on the validation split.
        cv_auc = roc_auc_score(Y_cv, model.predict_proba(X_cv)[:, 1])
        if max_roc_auc < cv_auc:
            max_roc_auc = cv_auc
            best_alpha = a
    return best_alpha
def Testing_model(X_train, Y_train, X_test, Y_test, best_alpha):
    """Refit MultinomialNB with the tuned alpha and evaluate on the test set.

    Plots the test confusion matrix, prints macro/micro F1, hamming loss
    and the full classification report, and returns the fitted model
    together with its test ROC AUC.
    """
    model = MultinomialNB(alpha=best_alpha, fit_prior=True, class_prior=None)
    model.fit(X_train, Y_train)
    positive_probs = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(Y_test, positive_probs)
    predicted = model.predict(X_test)
    skplt.plot_confusion_matrix(Y_test, predicted)
    print("macro f1 score for data :", metrics.f1_score(Y_test, predicted, average='macro'))
    print("micro f1 score for data:", metrics.f1_score(Y_test, predicted, average='micro'))
    print("hamming loss for data:", metrics.hamming_loss(Y_test, predicted))
    print("\n")
    print("Precision recall report for data:\n", metrics.classification_report(Y_test, predicted))
    print("\n")
    return model, auc
# Vectorize with bag-of-words (unigrams + bigrams), split into
# train/CV/test, tune alpha on the validation fold, then evaluate the
# tuned model on the held-out test fold.
BOW, X = Text_Into_Vector(CountVectorizer,data_nb['words'])
X_tr, X_cv, X_test, Y_tr, Y_test, Y_cv, X_train, Y_train = Split_data(X, data_nb['label'])
best_alpha_bow = Multinomial_NB(X_tr,X_cv,Y_tr,Y_cv)
NB_bow, roc_auc_bow = Testing_model(X_tr,Y_tr,X_test,Y_test,best_alpha_bow)
macro f1 score for data : 0.8045091029323634
micro f1 score for data: 0.8045454545454547
hamming loss for data: 0.19545454545454546
Precision recall report for data:
precision recall f1-score support
__label__0 0.80 0.81 0.81 1995
__label__1 0.81 0.80 0.80 1965
accuracy 0.80 3960
macro avg 0.80 0.80 0.80 3960
weighted avg 0.80 0.80 0.80 3960